Wittenberg University - Master of Science in Analytics

ANLT 510 - Advanced Statistics and Modeling

Day 1: Regression models

14 Sep 2021

Introduction

Overview

+ Ordinary Least Squares
+ Principal Component Regression
+ Partial Least Squares Regression
+ Regularized Regression
+ Multivariate Adaptive Regression Splines

Prereqs

# Data wrangling and visualization
library(dplyr)
library(ggplot2)
# Resampling and feature-engineering infrastructure
library(rsample)
library(recipes)
# Modeling, variable importance, and tidy model output
library(vip)
library(caret)
library(broom)
# Plotting / table helpers used by the slides
library(plotly)
library(reshape2)
library(kableExtra)
# MARS and example-data packages
library(mda)
library(AppliedPredictiveModeling)
library(earth)
# ames data
# fully cleaned and encoded Ames, IA housing data; Sale_Price is the response
ames <- AmesHousing::make_ames()

# split data
# Reproducible train/test split, stratified on the response so both
# partitions have a similar Sale_Price distribution.
set.seed(123)
split <- rsample::initial_split(ames, strata = "Sale_Price")
ames_train <- rsample::training(split)

Ordinary Least Squares

The Objective

Simple linear regression

# One-predictor OLS: sale price explained by above-ground living area;
# print the fit summary (coefficients, residuals, R-squared).
model1 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_train)
summary(model1)

Call:
lm(formula = Sale_Price ~ Gr_Liv_Area, data = ames_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-467304  -30764   -2026   22770  330067 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 18332.350   3767.507   4.866 1.22e-06 ***
Gr_Liv_Area   107.935      2.373  45.478  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 57100 on 2195 degrees of freedom
Multiple R-squared:  0.4851,    Adjusted R-squared:  0.4849 
F-statistic:  2068 on 1 and 2195 DF,  p-value: < 2.2e-16

Multiple linear regression

# Two main effects
model2 <- lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)

# Main effects plus their interaction; x * y is formula shorthand for
# x + y + x:y, so this fits exactly the same model as the explicit form
model3 <- lm(Sale_Price ~ Gr_Liv_Area * Year_Built, data = ames_train)

# Every available feature as a main effect
model4 <- lm(Sale_Price ~ ., data = ames_train)

Assessing model accuracy

# create a resampling method
# 10-fold cross-validation repeated 5 times -> 50 resamples per model;
# the same `cv` object is reused below so all models share identical folds
cv <- trainControl(
  method = "repeatedcv", 
  number = 10, 
  repeats = 5
  )

# model 1 CV
# seed set immediately before train() so every model sees the same fold
# assignments (enables paired comparison via resamples() later)
set.seed(123)
# wrapping the assignment in parentheses also prints the fitted object
(cv_model1 <- train(
  Sale_Price ~ Gr_Liv_Area, 
  data = ames_train, 
  method = "lm", #<<
  trControl = cv)
)
Linear Regression 

2197 samples
   1 predictor

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 1978, 1976, 1979, 1977, 1977, 1977, ... 
Resampling results:

  RMSE     Rsquared   MAE     
  57064.5  0.4934857  38834.13

Tuning parameter 'intercept' was held constant at a value of TRUE
# model 2 CV
# same seed before each train() call -> identical folds across models
set.seed(123)
cv_model2 <- train(
  Sale_Price ~ Gr_Liv_Area + Year_Built, 
  data = ames_train, 
  method = "lm",
  trControl = cv
  )

# model 3 CV
# adds the explicit interaction term to model 2's main effects
set.seed(123)
cv_model3 <- train(
  Sale_Price ~ Gr_Liv_Area + Year_Built + Gr_Liv_Area : Year_Built, 
  data = ames_train, 
  method = "lm",
  trControl = cv
  )

# model 4 CV
# all features as main effects
set.seed(123)
cv_model4 <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  method = "lm",
  trControl = cv
  )

# Extract out of sample performance measures
# resamples() aligns the 50 fold-level estimates across models so the
# summary compares MAE/RMSE/R-squared on identical resamples
summary(resamples(list(
  model1 = cv_model1, 
  model2 = cv_model2, 
  model3 = cv_model3,
  model4 = cv_model4
)))

Call:
summary.resamples(object = resamples(list(model1 = cv_model1, model2
 = cv_model2, model3 = cv_model3, model4 = cv_model4)))

Models: model1, model2, model3, model4 
Number of resamples: 50 

MAE 
           Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
model1 34239.95 37540.34 38807.95 38834.13 39999.83 43924.26    0
model2 29089.50 30100.15 31610.44 31661.78 32793.11 36500.32    0
model3 27943.01 29397.47 30723.79 30693.46 31598.72 35276.60    0
model4 13476.65 15165.91 16261.38 16918.30 18392.34 23365.20    0

RMSE 
           Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
model1 48292.25 54293.13 56140.60 57064.50 60547.33 67494.36    0
model2 38340.61 43084.57 46000.66 47041.11 50886.38 57333.13    0
model3 36832.89 41665.44 45275.75 46641.25 51302.08 60088.92    0
model4 19124.73 21646.77 33815.16 35068.63 40018.94 68677.36    0

Rsquared 
            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
model1 0.3096878 0.4503369 0.5012692 0.4934857 0.5413845 0.6673105    0
model2 0.5017891 0.6174076 0.6741613 0.6552257 0.6939632 0.7353204    0
model3 0.4558982 0.6202647 0.6896652 0.6628505 0.7110914 0.7597827    0
model4 0.5203830 0.7578330 0.8282277 0.8139947 0.9240234 0.9369869    0

Model concerns

Overview

1. Assumed linear relationship

2. Constant variance among residuals
3. No autocorrelation
4. More observations than predictors
5. No or little multicollinearity

Concern #2: Assumption of Constant variance among residuals

Concern #3: Assume that no autocorrelation exists

Concern #4: More observations than predictors

y x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
129002 1 2 3 6 6 2 3 7 2 5
371454 2 4 6 6 8 10 6 3 1 7
389726 6 4 10 6 1 9 7 6 10 2
309319 7 7 5 7 3 9 6 9 7 5
258595 1 6 9 6 3 6 2 6 3 5

Concern #5: Assume little or no multicollinearity

# Fit Gr_Liv_Area and TotRms_AbvGrd jointly (m1) and separately (m2, m3)
# to show how collinearity between them distorts the joint coefficients
# (TotRms_AbvGrd flips sign in m1 relative to m3 below)
m1 <- lm(Sale_Price ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_train)
m2 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_train)
m3 <- lm(Sale_Price ~ TotRms_AbvGrd, data = ames_train)

coef(m1) #<<
  (Intercept)   Gr_Liv_Area TotRms_AbvGrd 
   50199.8578      138.0862   -11952.0910 
coef(m2) #<<
(Intercept) Gr_Liv_Area 
 18332.3500    107.9354 
coef(m3) #<<
  (Intercept) TotRms_AbvGrd 
     26473.22      23854.15 

Principal Component Regression

The idea

R packages 📦

Implementation

# 1. hypergrid
# candidate numbers of principal components to evaluate
hyper_grid <- expand.grid(ncomp = seq(2, 40, by = 2))

# 2. PCR
set.seed(123)
cv_pcr <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  trControl = cv,
  method = "pcr", #<<
  # drop zero-variance columns, then center/scale so the principal
  # components are not dominated by features with large raw variances
  preProcess = c("zv", "center", "scale"), #<<
  tuneGrid = hyper_grid, #<<
  metric = "RMSE"
  )

# model with lowest RMSE
cv_pcr$bestTune
   ncomp
20    40
cv_pcr$results %>%
  filter(ncomp == as.numeric(cv_pcr$bestTune))
  ncomp     RMSE Rsquared      MAE   RMSESD RsquaredSD    MAESD
1    40 33994.82 0.818078 21841.24 5749.197 0.05818264 1519.858
# plot cross-validated RMSE
plot(cv_pcr)

Tuning

# 1. hypergrid
# p = number of candidate predictors (every column except the response)
p <- ncol(ames_train) - 1
# ncomp must be a whole number of components: seq(..., length.out = 10)
# produces fractional values (e.g. 10.67, 62.67), so floor() them —
# the same guard the MARS grid applies to nprune
hyper_grid <- expand.grid(ncomp = seq(2, p, length.out = 10) %>% floor()) #<<

# 2. PCR
set.seed(123)
cv_pcr <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  trControl = cv,
  method = "pcr", 
  preProcess = c("zv", "center", "scale"), 
  tuneGrid = hyper_grid, 
  metric = "RMSE"
  )

# RMSE
cv_pcr$results %>%
  filter(ncomp == cv_pcr$bestTune$ncomp)
     ncomp     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
1 62.66667 33232.03 0.8260373 20933.53 5691.975 0.05761334 1326.798
# plot cross-validated RMSE
plot(cv_pcr)

Partial Least Squares Regression

The idea

R packages 📦

Implementation

# PLS
# partial least squares: like PCR but components are constructed using
# the response as well as the predictors (supervised dimension reduction);
# reuses `hyper_grid` (ncomp values) defined in the PCR tuning chunk
set.seed(123)
cv_pls <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  trControl = cv,
  method = "pls", #<<
  preProcess = c("zv", "center", "scale"),
  tuneGrid = hyper_grid,
  metric = "RMSE"
  )

# model with lowest RMSE
cv_pls$bestTune
     ncomp
2 10.66667
cv_pls$results %>%
  filter(ncomp == as.numeric(cv_pls$bestTune))
     ncomp     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
1 10.66667 31830.57 0.8410703 17840.71 9230.482 0.08628164 1660.355
# plot cross-validated RMSE
plot(cv_pls)

Tuning

Regularized Regression

The Idea

  1. Multicollinearity (we’ve already seen how PCR & PLS help to resolve this)
  2. Insufficient solution ( \(p >> n\) )
  3. Interpretability
    • Approach 1: model selection
      • computationally inefficient (Ames data: \(2^{80}\) models to evaluate)
      • simply assume a feature as in or out \(\rightarrow\) hard thresholding
    • Approach 2: regularize
      • retain all coefficients
      • slowly pushes a feature’s effect towards zero \(\rightarrow\) soft thresholding

Regular regression

\[\begin{equation} \text{minimize} \bigg \{ SSE = \sum^n_{i=1} (y_i - \hat{y}_i)^2 \bigg \} \end{equation}\]

Regularized regression

\[\begin{equation} \text{minimize} \big \{ SSE + P \big \} \end{equation}\]

Ridge regression

\[\begin{equation} \text{minimize } \bigg \{ SSE + \lambda \sum^p_{j=1} \beta_j^2 \bigg \} \end{equation}\]

Lasso regression

\[\begin{equation} \text{minimize } \bigg \{ SSE + \lambda \sum^p_{j=1} | \beta_j | \bigg \} \end{equation}\]

Elastic net regression

\[\begin{equation} \text{minimize } \bigg \{ SSE + \lambda_1 \sum^p_{j=1} \beta_j^2 + \lambda_2 \sum^p_{j=1} | \beta_j | \bigg \} \end{equation}\]

Tuning

caret::getModelInfo("glmnet")$glmnet$parameters
  parameter   class                    label
1     alpha numeric        Mixing Percentage
2    lambda numeric Regularization Parameter

R packages 📦

Implementation

# tuning grid
# alpha mixes the penalty (0 = ridge, 1 = lasso, in between = elastic net);
# lambda sets the overall strength of the regularization penalty
hyper_grid <- expand.grid(
  alpha = seq(0, 1, by = .25),
  lambda = c(0.1, 10, 100, 1000, 10000)
)

# perform resampling
set.seed(123)
cv_glmnet <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  trControl = cv,
  method = "glmnet", #<<
  # coefficients are penalized jointly, so features must be on a common
  # scale; zv removes zero-variance columns first
  preProcess = c("zv", "center", "scale"),
  tuneGrid = hyper_grid,
  metric = "RMSE"
  )

# best model
# keep only the grid row matching the winning (alpha, lambda) pair
cv_glmnet$results %>%
  filter(
    alpha == cv_glmnet$bestTune$alpha,
    lambda == cv_glmnet$bestTune$lambda
    )
  alpha lambda     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
1     0  10000 30343.01 0.8525313 17007.45 8249.958 0.07851523 1542.699
# plot results
plot(cv_glmnet)

Regularization gives us a slight improvement (~$1K)

Multivariate Adaptive Regression Splines

The idea

However, these require the user to explicitly identify & incorporate them

\[\begin{equation} \text{y} = \begin{cases} \beta_0 + \beta_1(1.183606 - \text{x}) & \text{x} < 1.183606, \\ \beta_0 + \beta_1(\text{x} - 1.183606) & \text{x} > 1.183606 \end{cases} \end{equation}\]

\[\begin{equation} \text{y} = \begin{cases} \beta_0 + \beta_1(1.183606 - \text{x}) & \text{x} < 1.183606, \\ \beta_0 + \beta_1(\text{x} - 1.183606) & \text{x} > 1.183606 \quad \& \quad \text{x} < 4.898114, \\ \beta_0 + \beta_1(4.898114 - \text{x}) & \text{x} > 4.898114 \end{cases} \end{equation}\]

.pull-right[

R packages 📦

Tuning parameters

caret::getModelInfo("earth")$earth$parameters
  parameter   class          label
1    nprune numeric         #Terms
2    degree numeric Product Degree

Implementation

# tuning grid
# nprune: maximum number of terms retained after pruning (floored to
# whole numbers); degree: maximum interaction degree of hinge terms
hyper_grid <- expand.grid(
  nprune = seq(2, 50, length.out = 10) %>% floor(),
  degree = 1:3
)

# perform resampling
# no center/scale preprocessing: hinge-function splits are not affected
# by the scale of individual features
set.seed(123)
cv_mars <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  trControl = cv,
  method = "earth", #<<
  tuneGrid = hyper_grid,
  metric = "RMSE"
  )

# best model
# keep only the grid row matching the winning (nprune, degree) pair
cv_mars$results %>%
  filter(
    nprune == cv_mars$bestTune$nprune,
    degree == cv_mars$bestTune$degree
    )
  degree nprune     RMSE  Rsquared      MAE  RMSESD RsquaredSD    MAESD
1      2     50 25380.01 0.8975061 16079.21 6585.52 0.05291602 1243.379
# plot results
plot(cv_mars)

Model Comparison

Comparing error distributions

# Collect the five tuned models' resampling results; because every model
# was trained with the same seed and `cv` object, each of the 50 rows is
# the same fold across models (paired comparison)
results <- resamples(list(
  OLS  = cv_model4, 
  PCR  = cv_pcr, 
  PLS  = cv_pls,
  EN   = cv_glmnet,
  MARS = cv_mars
  ))

# distribution of fold-level RMSE per model
summary(results)$statistics$RMSE
         Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
OLS  19124.73 21646.77 33815.16 35068.63 40018.94 68677.36    0
PCR  25286.01 27946.59 31652.92 33232.03 38461.54 43023.36    0
PLS  20694.64 24299.46 29282.11 31830.57 37308.30 55655.26    0
EN   19810.38 23775.84 27844.18 30343.01 35136.13 50509.66    0
MARS 19540.96 21180.00 22965.64 25380.01 25994.51 49383.62    0
# box-and-whisker and dot plots of the fold-level RMSE values
# (lattice-based plot methods that caret provides for resamples objects),
# arranged side by side in a single row
p1 <- bwplot(results, metric = "RMSE")
p2 <- dotplot(results, metric = "RMSE")
gridExtra::grid.arrange(p1, p2, nrow = 1)

Feature Interpretation

Feature importance

# variable importance plot for the tuned MARS model
vip(cv_mars)

Feature effects

# Partial dependence of predicted Sale_Price on Gr_Liv_Area for the
# two-predictor OLS model (an lm fit, so the profile is a straight line)
pdp::partial(cv_model2, 
             pred.var = "Gr_Liv_Area", 
             grid.resolution = 10) %>% 
  autoplot()

# Joint partial dependence of the MARS predictions on living area and
# year built, rendered as a rotated, draped 3-D surface
pdp::partial(cv_mars, 
             pred.var = c("Gr_Liv_Area", "Year_Built"), 
             grid.resolution = 10) %>% 
  pdp::plotPartial(levelplot = FALSE, 
                   zlab = "yhat", 
                   drape = TRUE, 
                   colorkey = TRUE, 
                   screen = list(z = -20, x = -60))

Wrapping up

Summary

Questions?